'''
Filename: forecast_returns_qje.py

Authors: Carolin Pflueger, Emil Siriwardane, and Adi Sunderam
Affiliation: University of British Columbia, Harvard Business School

Description: Runs forecasting regressions with Hodrick (1992) standard errors.  
The returns that we forecast are:
    1. Annual returns on the volatility-sorted portfolio
    2. Annual excess returns on the CRSP-VW Index
'''

##  -----------------------------------------------------------------------
##                                import
##  -----------------------------------------------------------------------

from __future__ import print_function
from __future__ import division
import sys
import datetime
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from numpy.linalg import inv 


##  -----------------------------------------------------------------------
##                               functions
##  -----------------------------------------------------------------------
def hodrick(y, x, k, scale = 100, return_vcv = False):
    r'''
    Forecasting regression with overlapping returns.  Standard errors computed 
    using Hodrick (1992).

    Args:
        y:  DataFrame.  Contains one-period arithmatic returns (k = 1).
        x:  DataFrame.  Contains predictor variable
        k:  Int. Number of periods to aggregate returns over, for k > 1
        scale: Number to scale returns by.  Default is 100
        return_vcv: Boolean.  Indicates whether to return full VCV matrix.

        *Note that the function will automatically aggregate returns and line up
        dates properly

    Returns:
        k_step_reg: Contains regression results with uncorrected standard errors


    '''
    # == Get information on x variables == #
    if type(x) == pd.core.series.Series:
        if x.name == None:
            x.name = 'x'
        x = pd.DataFrame(x.values, columns = [x.name], index = x.index)
        xcols = x.columns
    else:
        xcols = x.columns

    # == Aggregate returns == #
    y.name = None
    yk = y.rolling(min_periods = k, window = k).apply(\
                   lambda x: np.prod(1 + x) - 1)

    # == Shift returns to match x, i.e. x(1) lines up with y(2), etc. == # 
    yf1 = pd.DataFrame(y.shift(-1), columns = ['f1']) * scale
    yfk = pd.DataFrame(yk.shift(-k), columns = ['fk']) * scale

    # == Merge with x variables == #
    ad = pd.merge(pd.merge(x, yf1, left_index = True, right_index = True), 
                  yfk, left_index = True, right_index = True).dropna()
    T = len(ad)

    # == Run one period ahead regression and get residuals == #
    one_step_reg = sm.OLS(ad['f1'], sm.add_constant(ad[xcols]), 
                          hasconst = True).fit()
    ef1 = np.matrix(one_step_reg.resid.values).T

    # == Run k-period ahead regression == #
    reg_string = "fk ~ " + " + ".join(xcols)
    k_step_reg = smf.ols(reg_string, ad).fit()

    #k_step_reg = sm.OLS(ad['fk'], sm.add_constant(ad[xcols]), 
    #                      hasconst = True).fit()
    efk = np.matrix(k_step_reg.resid.values).T

    # == Add constant to X == #
    Xc = np.matrix(sm.add_constant(ad[xcols]).values)
    numX = np.shape(Xc)[1]

    # == Compute Exx' == #
    Exx = np.zeros((numX, numX))
    for t in range(T):
        Exx = Exx + Xc[t, :].T.dot(Xc[t, :])
    Exx = Exx / T

    # == Compute Hodrick (1992) standard errors == #
    S = np.zeros((numX, numX))
    for t in range(k - 1, T):
        wk = ef1[t] * Xc[t - k + 1: t + 1, :].sum(0)
        S = S + wk.T.dot(wk)
    S = S / T 

    vcv = inv(Exx).dot(S).dot(inv(Exx)) / T
    se = np.sqrt(np.diag(vcv))


    # == Return HAC results as well == #
    hac = k_step_reg.get_robustcov_results(cov_type = 'HAC', maxlags = k + 1, 
                                           use_correction = True)

    if return_vcv:
        return hac, k_step_reg.params / se, vcv, ad
    else:
        return hac, k_step_reg.params / se

def print_helper(tstat_df, reg_params):
    r''' Helper function to print out corrected t-stats'''

    names = tstat_df.index
    values = tstat_df.values
    ses = reg_params / values

    txt = ""
    for i in range(len(names)):
        txt = txt + "%s:  %.3f (%.2f, %.2f) \n" %(names[i], reg_params[i], values[i], ses[i])

    return txt
#  -----------------------------------------------------------------------
##                             main program
##  -----------------------------------------------------------------------
def main(argv = None):

    # == Parameters == #
    pss_min_date = datetime.datetime(1970, 4, 1)
    pss_max_date = datetime.datetime(2016, 6, 30)    
    rr_var = 'ldt_real_rate_1yr'

    # == Read in monthly FF data == #
    ff = pd.read_csv("F-F_Research_Data_Factors.csv",
                     header = 2, nrows = 1122)
    ff['Date'] = pd.to_datetime(ff['Unnamed: 0'].astype('str'), 
                                 format = "%Y%m")
    ff.drop('Unnamed: 0', 1, inplace = True)
    ff.rename(columns = {"Mkt-RF":"MktRf"}, inplace = True)
    ff.set_index('Date',inplace = True)
    ff = ff[((ff.index >= pss_min_date) & (ff.index <= pss_max_date))]

    #Quarterly FF
    ffq_all = ff.rolling(min_periods = 3, window = 3).apply(\
                    lambda x: np.prod(1 + x / 100) - 1)
    ffq_all = ffq_all.resample('Q').last()    
    ffq = ffq_all[['MktRf', 'SMB', 'HML']]

    #Annual FF
    ffa = ff.rolling(min_periods = 12, window = 12).apply(\
                    lambda x: np.prod(1 + x/100) - 1)
    ffa = ffa[['MktRf', 'SMB', 'HML']]
    ffa = ffa.resample('Q', how = 'last')
    ffa_ahead = ffa.shift(-4)
    
    # == Read in Vol-BM spread and real rate == #
    spread_rr = pd.read_stata("return_forecasting.dta")
    spread_rr = spread_rr[['yq', rr_var, 'tvol_BM_Star_ew_spr',
                            'tvol_BM_Star_ew_ret']]
    
    spread_rr.set_index('yq', inplace =True)
    spread_rr = spread_rr.resample('Q').last()
    spread_rr.index.name = "Date"
    
    #Standardize PVS to have mean zero and variance one
    spread_rr['tvol_BM_Star_ew_spr'] = (spread_rr['tvol_BM_Star_ew_spr'] - \
                                 spread_rr['tvol_BM_Star_ew_spr'].mean()) / \
                                        spread_rr['tvol_BM_Star_ew_spr'].std()

    lmh = spread_rr['tvol_BM_Star_ew_ret'] / 100

    # == Regress annual portfolio returns on PVS == #
    reg, t = hodrick(lmh, spread_rr['tvol_BM_Star_ew_spr'], 4)
    print(reg.summary(yname = "Annual VOL Returns"))
    print("\nHodrick Corrected tstats and SEs:")
    print(print_helper(t, reg.params) + "\n \n")


    # == Regress annual portfolio returns on real rate == #
    reg, t = hodrick(lmh, spread_rr[rr_var], 4)
    print(reg.summary(yname = "Annual VOL Returns"))
    print("\nHodrick Corrected tstats and SEs:")
    print(print_helper(t, reg.params) + "\n \n")

    # == Regress annual Mkt-Rf returns on PVS == #
    reg, t = hodrick(ffq['MktRf'], spread_rr['tvol_BM_Star_ew_spr'], 4)
    print(reg.summary(yname = "Annual Mkt Returns"))
    print("\nHodrick Corrected tstats and SEs:")
    print(print_helper(t, reg.params) + "\n \n")

    # == Regress annual Mkt-Rf returns on real rate == #
    reg, t = hodrick(ffq['MktRf'], spread_rr[rr_var], 4)
    print(reg.summary(yname = "Annual Mkt Returns"))
    print("\nHodrick Corrected tstats and SEs:")
    print(print_helper(t, reg.params) + "\n \n")


if __name__ == "__main__": sys.exit(main(sys.argv))



